source : https://www.kaggle.com/datasets/sanjeetsinghnaik/fifa-23-players-dataset¶

Part Ⅰ :¶

In [1]:
from pyforest import *
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.subplots as sp
from plotly.subplots import make_subplots
import sidetable
import math
import folium
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import geopandas as gpd
import emoji
import warnings
warnings.filterwarnings('ignore')
In [2]:
# Location of the FIFA 23 players CSV, kept in a named constant so the source is easy to spot.
FIFA_DATA_URL = "https://raw.githubusercontent.com/EylonYehiel/projects/main/Data/Fifa%2023%20Players%20Data.csv"
fifa = pd.read_csv(FIFA_DATA_URL)
In [3]:
# Peek at five randomly chosen players (no seed is set, so the rows change on every run).
fifa.sample(n=5)
Out[3]:
Known As Full Name Overall Potential Value(in Euro) Positions Played Best Position Nationality Image Link Age ... LM Rating CM Rating RM Rating LWB Rating CDM Rating RWB Rating LB Rating CB Rating RB Rating GK Rating
12663 C. Forino Chris Forino 63 72 1000000 CB CB England https://cdn.sofifa.net/players/244/240/23_60.png 22 ... 45 47 45 57 59 57 59 65 59 15
2134 Germano Cuba Oswaldo Germano Cuba Correia 74 74 4200000 LB,CAM,LM LB Brazil https://cdn.sofifa.net/players/230/353/23_60.png 26 ... 74 73 74 74 71 74 74 67 74 21
6961 L. Acosta Lucas Acosta 68 72 1300000 GK GK Argentina https://cdn.sofifa.net/players/220/624/23_60.png 27 ... 29 26 29 31 29 31 31 32 31 69
9493 H. Heggheim Henrik Heggheim 66 76 1800000 CB CB Norway https://cdn.sofifa.net/players/257/072/23_60.png 21 ... 56 58 56 64 66 64 65 68 65 16
5121 L. Ruiz Leonardo Ruiz 70 72 2000000 ST ST Colombia https://cdn.sofifa.net/players/231/868/23_60.png 26 ... 68 61 68 49 46 49 46 42 46 20

5 rows × 89 columns

In [4]:
# (rows, columns) of the loaded players table.
fifa.shape
Out[4]:
(18539, 89)
In [5]:
# Sum the sidetable missing-value report over all columns.
# A 'missing' total of 0 in the result means no column contains missing values.
fifa.stb.missing().sum()
Out[5]:
missing          0.0
total      1649971.0
percent          0.0
dtype: float64
In [6]:
# List every column label. Note that ' GoalkeeperKicking' carries a leading space in the raw data.
fifa.columns
Out[6]:
Index(['Known As', 'Full Name', 'Overall', 'Potential', 'Value(in Euro)',
       'Positions Played', 'Best Position', 'Nationality', 'Image Link', 'Age',
       'Height(in cm)', 'Weight(in kg)', 'TotalStats', 'BaseStats',
       'Club Name', 'Wage(in Euro)', 'Release Clause', 'Club Position',
       'Contract Until', 'Club Jersey Number', 'Joined On', 'On Loan',
       'Preferred Foot', 'Weak Foot Rating', 'Skill Moves',
       'International Reputation', 'National Team Name',
       'National Team Image Link', 'National Team Position',
       'National Team Jersey Number', 'Attacking Work Rate',
       'Defensive Work Rate', 'Pace Total', 'Shooting Total', 'Passing Total',
       'Dribbling Total', 'Defending Total', 'Physicality Total', 'Crossing',
       'Finishing', 'Heading Accuracy', 'Short Passing', 'Volleys',
       'Dribbling', 'Curve', 'Freekick Accuracy', 'LongPassing', 'BallControl',
       'Acceleration', 'Sprint Speed', 'Agility', 'Reactions', 'Balance',
       'Shot Power', 'Jumping', 'Stamina', 'Strength', 'Long Shots',
       'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties',
       'Composure', 'Marking', 'Standing Tackle', 'Sliding Tackle',
       'Goalkeeper Diving', 'Goalkeeper Handling', ' GoalkeeperKicking',
       'Goalkeeper Positioning', 'Goalkeeper Reflexes', 'ST Rating',
       'LW Rating', 'LF Rating', 'CF Rating', 'RF Rating', 'RW Rating',
       'CAM Rating', 'LM Rating', 'CM Rating', 'RM Rating', 'LWB Rating',
       'CDM Rating', 'RWB Rating', 'LB Rating', 'CB Rating', 'RB Rating',
       'GK Rating'],
      dtype='object')
In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
fifa.describe()
Out[7]:
Overall Potential Value(in Euro) Age Height(in cm) Weight(in kg) TotalStats BaseStats Wage(in Euro) Release Clause ... LM Rating CM Rating RM Rating LWB Rating CDM Rating RWB Rating LB Rating CB Rating RB Rating GK Rating
count 18539.000000 18539.000000 1.853900e+04 18539.000000 18539.000000 18539.000000 18539.000000 18539.000000 18539.000000 1.853900e+04 ... 18539.000000 18539.000000 18539.000000 18539.000000 18539.000000 18539.000000 18539.000000 18539.000000 18539.000000 18539.000000
mean 65.852042 71.016668 2.875461e+06 25.240412 181.550839 75.173904 1602.114569 357.946221 8824.537462 5.081688e+06 ... 58.451319 57.374076 58.451319 56.281569 55.928583 56.281569 55.650251 54.528184 55.650251 23.257134
std 6.788353 6.192866 7.635129e+06 4.718163 6.858097 7.013593 273.160237 39.628259 19460.531154 1.467203e+07 ... 13.987122 13.171194 13.987122 13.903836 13.872190 13.903836 14.159466 14.743929 14.159466 15.108925
min 47.000000 48.000000 0.000000e+00 16.000000 155.000000 49.000000 759.000000 224.000000 0.000000 0.000000e+00 ... 18.000000 18.000000 18.000000 17.000000 19.000000 17.000000 17.000000 18.000000 17.000000 10.000000
25% 62.000000 67.000000 4.750000e+05 21.000000 177.000000 70.000000 1470.000000 331.000000 1000.000000 6.650000e+05 ... 54.000000 53.000000 54.000000 51.000000 48.000000 51.000000 49.000000 45.000000 49.000000 17.000000
50% 66.000000 71.000000 1.000000e+06 25.000000 182.000000 75.000000 1640.000000 358.000000 3000.000000 1.500000e+06 ... 62.000000 60.000000 62.000000 59.000000 59.000000 59.000000 59.000000 58.000000 59.000000 18.000000
75% 70.000000 75.000000 2.000000e+06 29.000000 186.000000 80.000000 1786.000000 385.000000 8000.000000 3.400000e+06 ... 67.000000 66.000000 67.000000 66.000000 66.000000 66.000000 65.000000 66.000000 65.000000 20.000000
max 91.000000 95.000000 1.905000e+08 44.000000 206.000000 105.000000 2312.000000 502.000000 450000.000000 3.667000e+08 ... 92.000000 91.000000 92.000000 88.000000 89.000000 88.000000 87.000000 90.000000 87.000000 90.000000

8 rows × 71 columns


In [8]:
def bold(text):
    """Return ``text`` (formatted as a string) wrapped in the ANSI bold-on / reset escape codes."""
    return f"\033[1m{text}\033[0m"

Basic-Features Distributions¶

In [9]:
# Overlay the three physical attributes in one figure; Age uses one-year-wide bins.
fig = go.Figure(data=[
    go.Histogram(x=fifa['Age'], name='Age', xbins={'size': 1}),
    go.Histogram(x=fifa['Height(in cm)'], name='Height (cm)'),
    go.Histogram(x=fifa['Weight(in kg)'], name='Weight (kg)'),
])
# update_layout returns the figure, so leaving it as the last expression renders the chart.
fig.update_layout(
    height=600,
    width=1100,
    title='Histograms of Attributes :',
    xaxis_title_text='Attribute Value',
    yaxis_title_text='Frequency',
)

In [10]:
# Number of players per preferred foot.
foot = fifa['Preferred Foot'].value_counts()
# Look the counts up by label rather than by position: positional integer access on a
# string-indexed Series is deprecated in recent pandas, and it silently depended on
# 'Right' being the most frequent value to line up with the hard-coded labels.
trace1 = go.Pie(labels=['Right','Left'],values=[foot.get('Right', 0),foot.get('Left', 0)],
    hoverinfo='label+value', textinfo='percent', textfont=dict(size=18),
    marker=dict(colors=['#DC143C','#FFA500'],line=dict(color='#FFFFFF', width=4)), pull=[0, 0.1])

left_percentage = 11.5      # Average percentage of left-handed people (Wikipedia)
right_percentage = 88.5
# Reference pie for the general population, styled identically so the two charts compare directly.
trace2 = go.Pie(labels=['Right', 'Left'], values=[right_percentage, left_percentage],
    hoverinfo='label+percent', textinfo='percent', textfont=dict(size=18),
    marker=dict(colors=['#DC143C', '#FFA500'], line=dict(color='#FFFFFF', width=4)), pull=[0, 0.1])
In [11]:
# Two side-by-side pie ("domain") cells: general population on the left, footballers on the right.
fig = make_subplots(
    rows=1,
    cols=2,
    specs=[[{'type':'domain'}, {'type':'domain'}]],
    subplot_titles=['Population', 'Footballers'],
)
fig.add_trace(trace1, row=1, col=2)  # footballers pie
fig.add_trace(trace2, row=1, col=1)  # population pie
fig.update_layout(
    title_text='Strong side (Right or Left) - football players VS whole population :',
    height=600,
    width=1000,
)
fig.show()
  • Assumption: a left-handed person is also left-footed.
  • The percentage of left-footed football players is more than twice the percentage of left-handed people in the general population. Coincidence?

By Positions :¶

In [12]:
# One stacked histogram per attribute, coloured by each player's best position.
# Each entry is (column, number of bins, chart title).
position_histograms = [
    ("Age", 55, 'Ages distribution by positions'),
    ("Weight(in kg)", 55, 'Weight distribution by positions'),
    ("Height(in cm)", 50, 'Height distribution by positions'),
]
for column, bin_count, chart_title in position_histograms:
    fig = px.histogram(fifa, x=column, color="Best Position", nbins=bin_count)
    fig.update_layout(height=450, width=1100, title=chart_title, yaxis_title_text='Frequency')
    fig.show()

Features heatmap :¶

In [13]:
# Pairwise correlations between a hand-picked set of columns.
heatmap_columns = ['Overall','Reactions','Composure','Passing Total','Wage(in Euro)','Value(in Euro)','Shot Power','Age']
correlations = fifa[heatmap_columns].corr()
# The colour scale is pinned to [0, 1].
# NOTE(review): any negative correlation falls outside that range — confirm this is intended.
fig = px.imshow(correlations, color_continuous_scale='OrRd', zmin=0, zmax=1)
fig.update_layout(width=800, height=600, title='Strong Connnections Zoom-in:')
fig.show()

Random forest model to predict player's overall rank by his reactions, composure and passing scores, his wage and his age :¶

In [14]:
# Predictor columns and the column to be predicted.
features = ['Reactions', 'Composure','Passing Total', 'Wage(in Euro)', 'Age']
target = 'Overall'

# Hold out 20% of the players for evaluation; fixed seeds keep the split and the forest reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    fifa[features],
    fifa[target],
    test_size=0.2,
    random_state=42,
)
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

Test:¶

In [15]:
# Score the held-out predictions: RMSE (square root of the MSE) and the regressor's own score on the test split.
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2_score = model.score(X_test, y_test)
print(f"RMSE: {bold(rmse)}", f"r^2:  {bold(r2_score)}", sep="\n")
RMSE: 2.397600706049064
r^2:  0.8728423345934998
pep
In [16]:
# Importance of each predictor as reported by the fitted forest, listed in `features` order.
importances = model.feature_importances_
importance_lines = (f"{name}: {weight}" for name, weight in zip(features, importances))
print("Feature Importances:", *importance_lines, sep="\n")
Feature Importances:
Reactions: 0.7533940990013311
Composure: 0.047712616582223796
Passing Total: 0.07441100885822209
Wage(in Euro): 0.08892444372035126
Age: 0.03555783183787177
In [17]:
# Actual vs. predicted ratings on the test split, with an OLS trendline through the points.
axis_labels = {'x': 'Actual Score', 'y': 'Predicted Score'}
fig = px.scatter(
    x=y_test,
    y=y_pred,
    labels=axis_labels,
    title='Actual vs. Predicted Overall Ratings:',
    trendline='ols',
)
fig.update_layout(height=500)
fig.show()

Create prediction function:¶

In [18]:
def pred(Reactions, Composure, Pass, Wage, Age):
    """Print the overall rating the fitted ``model`` predicts for one player.

    The five arguments are passed to ``model.predict`` as a single row, in the
    order Reactions, Composure, Pass, Wage, Age. Nothing is returned; the
    prediction is printed.
    """
    p = model.predict([[Reactions,Composure,Pass,Wage,Age]])
    # predict returns a one-element array; take the element explicitly, because
    # calling float() on a whole array is deprecated in recent NumPy.
    print(f"Overall score for {Reactions, Composure, Pass, Wage, Age} values is : {float(p[0])}")
In [19]:
# Example players as (Reactions, Composure, Pass, Wage, Age) tuples.
example_players = [
    (81, 64, 90, 50000, 28),
    (78, 86, 70, 100000, 30),
    (94, 90, 60, 700000, 21),
    (94, 80, 80, 60000, 22),
]
for player_stats in example_players:
    pred(*player_stats)
Overall score for (81, 64, 90, 50000, 28) values is : 83.17
Overall score for (78, 86, 70, 100000, 30) values is : 79.7
Overall score for (94, 90, 60, 700000, 21) values is : 90.52
Overall score for (94, 80, 80, 60000, 22) values is : 85.2

In [20]:
# Columns shown in the "top players" tables below, in display order.
general_features = [
    'Full Name', 'Age', 'Height(in cm)', 'Weight(in kg)', 'Nationality',
    'Overall', 'Potential', 'Value(in Euro)', 'Wage(in Euro)',
    'Sprint Speed', 'Defending Total', 'Dribbling Total', 'Pace Total',
    'Shooting Total', 'Passing Total', 'Physicality Total',
    'Best Position', 'Club Name', 'Preferred Foot',
    'BaseStats', 'TotalStats', 'International Reputation', 'Weak Foot Rating',
]

Top field players :¶

In [21]:
# Outfield players only (everyone whose best position is not goalkeeper), limited to the display columns.
is_outfield = fifa['Best Position'] != 'GK'
fields = fifa.loc[is_outfield, general_features]
# Ten best, ranked by Overall, then TotalStats, then market value (all descending).
fields.sort_values(by=['Overall','TotalStats','Value(in Euro)'], ascending=False).head(10)
Out[21]:
Full Name Age Height(in cm) Weight(in kg) Nationality Overall Potential Value(in Euro) Wage(in Euro) Sprint Speed ... Shooting Total Passing Total Physicality Total Best Position Club Name Preferred Foot BaseStats TotalStats International Reputation Weak Foot Rating
3 Kevin De Bruyne 31 181 70 Belgium 91 91 107500000 350000 73 ... 88 93 77 CM Manchester City Right 483 2303 4 5
2 Robert Lewandowski 33 185 81 Poland 91 91 84000000 420000 75 ... 91 79 83 ST FC Barcelona Right 458 2205 5 4
0 Lionel Messi 35 169 67 Argentina 91 91 54000000 195000 76 ... 89 90 64 CAM Paris Saint-Germain Left 452 2190 5 4
4 Kylian Mbappé 23 182 73 France 91 95 190500000 230000 97 ... 89 80 76 ST Paris Saint-Germain Right 470 2177 4 4
1 Karim Benzema 34 185 81 France 91 91 64000000 450000 80 ... 88 83 78 CF Real Madrid CF Right 455 2147 4 4
5 Mohamed Salah 30 175 71 Egypt 90 90 115500000 270000 91 ... 89 82 75 RW Liverpool Left 471 2226 4 3
8 C. Ronaldo dos Santos Aveiro 37 187 83 Portugal 90 90 41000000 220000 83 ... 92 78 75 ST Manchester United Right 445 2159 5 4
9 Virgil van Dijk 30 193 92 Netherlands 90 90 98000000 230000 91 ... 60 71 86 CB Liverpool Right 461 2117 4 3
17 Joshua Kimmich 27 177 75 Germany 89 90 105500000 130000 60 ... 72 87 79 CDM FC Bayern München Right 473 2283 4 4
13 Carlos Henrique Venancio Casimiro 30 185 84 Brazil 89 89 86000000 240000 66 ... 73 75 90 CDM Manchester United Right 460 2209 3 3

10 rows × 23 columns

Top goalies :¶

In [22]:
# Goalkeepers keep every column; the display columns are selected only for the ranking table.
goalies = fifa.loc[fifa['Best Position'] == 'GK']
# Ten best, ranked by Overall, then TotalStats, then market value (all descending).
(
    goalies
    .loc[:, general_features]
    .sort_values(by=['Overall','TotalStats','Value(in Euro)'], ascending=False)
    .head(10)
)
Out[22]:
Full Name Age Height(in cm) Weight(in kg) Nationality Overall Potential Value(in Euro) Wage(in Euro) Sprint Speed ... Shooting Total Passing Total Physicality Total Best Position Club Name Preferred Foot BaseStats TotalStats International Reputation Weak Foot Rating
7 Manuel Neuer 36 193 93 Germany 90 90 13500000 72000 60 ... 88 91 91 GK FC Bayern München Right 501 1535 5 4
6 Thibaut Courtois 30 199 96 Belgium 90 91 90000000 250000 52 ... 89 75 89 GK Real Madrid CF Left 473 1334 4 3
16 Ederson Santana de Moraes 28 188 86 Brazil 89 91 88000000 210000 63 ... 82 93 88 GK Manchester City Left 502 1583 3 3
18 Alisson Ramses Becker 29 191 91 Brazil 89 90 79000000 190000 49 ... 85 85 90 GK Liverpool Right 489 1437 3 3
14 Jan Oblak 29 188 87 Slovenia 89 91 85500000 100000 58 ... 90 78 87 GK Atlético de Madrid Right 479 1402 5 3
27 Marc-André ter Stegen 30 187 85 Germany 88 89 68500000 210000 50 ... 85 87 85 GK FC Barcelona Right 480 1443 4 4
26 Keylor Navas 35 185 80 Costa Rica 88 88 10000000 85000 53 ... 84 75 87 GK Paris Saint-Germain Right 478 1428 3 3
22 Gianluigi Donnarumma 23 196 90 Italy 88 92 103500000 110000 55 ... 83 79 85 GK Paris Saint-Germain Right 478 1375 3 3
37 Mike Maignan 26 191 89 France 87 90 80000000 90000 53 ... 82 85 85 GK AC Milan Right 477 1496 2 4
35 David De Gea Quintana 31 192 76 Spain 87 87 42000000 150000 50 ... 80 76 84 GK Manchester United Right 468 1415 4 3

10 rows × 23 columns


Which country do most players come from?¶

  • Rank the best players by, in order: Overall rating -> Total Statistics -> Value.
In [23]:
# Players ranked by Overall, then TotalStats, then market value (all descending).
# value_counts below orders by frequency, so this ranking does not affect the chart.
best = fifa.sort_values(['Overall','TotalStats','Value(in Euro)'], ascending = False)
# The 20 most common nationalities and their player counts.
x = best.Nationality.value_counts().head(20)
colors = ['red', 'gold', 'orange', 'blue', 'turquoise', 'green', 'indigo', 'purple', 'brown', 'teal']
# The palette has 10 entries but up to 20 bars are drawn; repeat it so every bar
# receives an explicit colour instead of only the first ten.
bar_colors = [colors[i % len(colors)] for i in range(len(x))]
fig = go.Figure([go.Bar(x=x.index, y=x.values, marker_color=bar_colors)])
fig.update_layout(width=900, height=500, title='Top 20 origin nationalities :' , xaxis_title='Nationality', yaxis_title='Number of Players')
fig.show()

Import coordinates for countries display :¶

In [24]:
# Country-level latitude/longitude table, reduced to the columns needed and keyed like fifa['Nationality'].
coordinates = (
    pd.read_csv("https://raw.githubusercontent.com/EylonYehiel/projects/main/Data/world_and_usa_states_coordinated.csv")
    .loc[:, ['country', 'latitude', 'longitude']]
    .rename(columns={'country': 'Nationality'})
)
# Relabel one row as 'England' so that it matches the players' Nationality value.
# NOTE(review): row 73 is addressed by position and depends on the CSV's row order —
# confirm it is the intended country row.
coordinates.iloc[73, 0] = 'England'
# merge defaults to an inner join: players whose nationality has no coordinates row are dropped.
cofifa = fifa.merge(coordinates, on='Nationality')[
    ['Full Name','Nationality','latitude', 'longitude','Overall', 'Value(in Euro)', 'Age', 'Club Name','International Reputation']
]
cofifa.sample(7)
Out[24]:
Full Name Nationality latitude longitude Overall Value(in Euro) Age Club Name International Reputation
5000 Curtis Nelson England 55.378051 -3.435973 68 1000000 29 Cardiff City 1
15210 Matías Rojas Paraguay -23.442503 -58.443832 73 3300000 26 Racing Club 1
9293 Martí Vilà García Spain 40.463667 -3.749220 62 850000 23 FC Andorra 1
2177 Adam Chrzanowski Poland 51.919438 19.145136 61 675000 23 Wisła Płock 1
14079 Seifedin Chabbi Tunisia 33.886917 9.537499 66 875000 28 SV Ried 1
379 Axel Werner Argentina -38.416097 -63.616672 70 1800000 26 Elche CF 1
10210 Marco Burch Switzerland 46.818188 8.227512 66 2000000 21 FC Luzern 1

Players origin distribution on the map:¶

In [25]:
def _popup_html(player):
    """Build the HTML shown when a player's marker is clicked."""
    return f"<b>Full Name:</b>{player['Full Name']}\n<b>\nClub Name:</b> ({player['Club Name']})<br><b>Nationality:</b> {player['Nationality']}<br><b>Age:</b> {player['Age']}"


# World view with one clustered marker per player, placed at the player's country coordinates.
world_map = folium.Map(location=[0, 0], zoom_start=1.5, width=850, height=500)
mc = MarkerCluster()
# Only rows with both coordinates present can be placed on the map.
located_players = cofifa.dropna(subset=['latitude', 'longitude'])
for _, player in located_players.iterrows():
    position = [player['latitude'], player['longitude']]
    mc.add_child(Marker(position, popup=_popup_html(player)))
world_map.add_child(mc)
world_map
Out[25]:
Make this Notebook Trusted to load map: File -> Trust Notebook